

#Building Data-Frame for PGMS and Protest Analysis 

#Notes: 

# Loading Packages  -------------------------------------------------------
library(tidyverse)
library(dataverse)
library(bit)
library(peacesciencer)
library(countrycode)
library(stevemisc)
library(maps)
library(countrycode)
library(vdemdata)
library(janitor) 
library(wbstats)
library(miceadds)
library(sandwich)
library(lmtest)
library(stargazer)
library(texreg)
library(foreign)

#Loading all of the Dataframes / Preparing them to merge



# PGM Data (pgm_df) ----------------------------------------------------------------


Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
pgm_df <- get_dataframe_by_name(
  filename = "pgmdv2_countryyear.tab",
  dataset = "doi:10.7910/DVN/YK8L4I",
  server = "dataverse.harvard.edu"
)




# VDem Data (vdem_df) ---------------------------------------------------------------


vdem_df <- vdem 
vdem_df <- vdem_df %>%
  filter(year >= 1981 & year <= 2014)
vdem_df <- vdem_df[, c("country_name", 
                       "COWcode", 
                       "year",
                       "v2x_civlib", 
                       "e_v2x_civlib_3C", 
                       "e_v2x_civlib_4C", 
                       "e_v2x_civlib_5C")]
#better names 
vdem_df <- vdem_df |>
  rename(cow = COWcode) |>
  rename(country = country_name) |> 
  rename(vdem_civlib = v2x_civlib) |>
  filter(year >= 1981 & year <= 2014)
#adding cow codes to make merging later easier 
ccode_dta <- codelist_panel
ccode_dta <- ccode_dta %>% 
  filter(year >= 1981 & year <= 2014)
ccode_dta <- ccode_dta[, c( "year", 
                            "continent", 
                            "cown", 
                            "region", 
                            "country.name.en", 
                            "gwn", 
                            "iso3c")]

ccode_dta <- ccode_dta |>
  rename(cow = cown)
#adding in gwno 
vdem_df <- left_join(vdem_df, ccode_dta, by = c("year" = "year", 
                                                "cow" = "cow"))
vdem_df <- unique(vdem_df)







# Youth Bulge & Urban Pop Data (pop_data) --------------------------------------------



pop <- wb_search(pattern = "population age")
rm(pop)
indicators <- c("youth_bulge"  = "SP.POP.1519.MA.5Y",
                "urban_pop" = "SP.URB.TOTL.IN.ZS",
                "natl_rents" = "NY.GDP.PETR.RT.ZS", 
                "pop_density" = "EN.POP.DNST")
pop_data <- wb_data(indicators, 
                    mrv = 45) |>
  select(!iso2c) |>
  rename(year = date)
rm(indicators)

#time frame
pop_data <- pop_data |>
  filter(year >= 1981 & year <= 2014) 

#merging in ccodes 
pop_data <- pop_data |>
  left_join(ccode_dta, 
            pop_data, 
            by = c("year" = "year", 
                   "iso3c" = "iso3c"))

pop_data <- pop_data[!is.na(pop_data$cow), ] #microstates and whatnot 










# GDP and Conflict Variables (cow_df)  ---------------------------------------------


library(peacesciencer)
cow_df <- create_stateyears(system = "cow") 
cow_df <- add_sdp_gdp(cow_df)
cow_df <- add_gwcode_to_cow(cow_df)
cow_df <- add_ucdp_acd(cow_df)
cow_df <- add_sdp_gdp(cow_df)
cow_df <- add_nmc(cow_df)
cow_df <- add_rugged_terrain(cow_df)

#Time since conflict variable 
library(stevemisc)
cow_df <- ps_btscs(cow_df, ucdpongoing, year, ccode)
summary(cow_df$spell)
cow_df <- cow_df |>
  rename(peace_years = spell)
cow_df <- cow_df |>
  filter(year >= 1981 & year <= 2014) 










# PTS Data (PTS_2024)  ---------------------------------------------------------------

pts_df <- "https://www.politicalterrorscale.org/Data/Files/PTS-2024.RData"
load(url(pts_df))

PTS_2024 <- PTS_2024 |>
  filter(Year > 1980 & Year < 2015) 

PTS_2024 <- PTS_2024 |> 
  rename(year = Year) |>
  rename(cowcode = COW_Code_N)

rm(pts_df)


# Coup Data(coups)---------------------------------------------------------------
coups <- read.delim('http://www.uky.edu/~clthyn2/coup_data/powell_thyne_coups_final.txt')
# Assuming coups is your dataframe
coups <- coups |>
  mutate(coup_attempt = 1)
coups <- select(coups, -c("ccode_gw", 
                          "ccode_polity", 
                          "month", 
                          "day", 
                          "version"))

#correct dataframe 
coups <- coups |>
  filter(year >= 1981 & year <= 2014)





# EPR Data ----------------------------------------------------------------

epr <- read.delim("https://icr.ethz.ch/data/epr/core/EPR-2021.txt")
View(epr)

#I need to create an indicator for the level of ethnic inequality by state
#this is going to be a bit of a pain
epr <- epr |>
  rename(country = statename)

#need to first make group year the UOA
expanded_epr <- epr |>
  group_by(group, gwid) |>
  summarise(from = min(from), to = max(to)) |>
  uncount(to - from + 1, .id = "index") |>
  mutate(year = from + index - 1) |>
  select(-from, -to, -index) |>
  left_join(epr, by = c("group", "gwid"))

View(expanded_epr)


#Now we need to sort by country year, and add indicators for ethnic exclusion
# Create the new variable ethnic_exclus based on status
expanded_epr$ethnic_exclus <- ifelse(expanded_epr$status %in% c("DISCRIMINATED", "POWERLESS"), 1, 0)
hist(expanded_epr$ethnic_exclus)


# Group by country and year and collapse observations
collapsed_epr <- expanded_epr |>
  group_by(gwid, year) |>
  summarise(
    ethnic_exclus = as.numeric(any(ethnic_exclus == 1)),
    total_size = sum(size),
    num_groups = sum(ethnic_exclus)
  )


#I need an interaction term between presence and group size 
collapsed_epr$size_exclu_int <- collapsed_epr$num_groups * collapsed_epr$total_size
hist(collapsed_epr$size_exclu_int)


#checking everything out 
hist(collapsed_epr$ethnic_exclus)
hist(collapsed_epr$num_groups)
rm(expanded_epr, epr)
View(collapsed_epr)



#set the right timeframe 
collapsed_epr <- collapsed_epr |>
  filter(year < 2015,
         year > 1980)




# NAVCO Data (protest_df) --------------------------------------------------------------

Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
navco_13 <- get_dataframe_by_doi(
  filedoi = "doi:10.7910/DVN/ON9XND/XRA6WM",
  server = "dataverse.harvard.edu"
)

#NAVCO UOA is the campaign, I need to make it country year to be able to work with it
#I also need to make sure to save the dummy variables for violent and non-violent dissent
navco_13 <- navco_13 |>
  rename(country = LOCATION)
#getting the country codes 
library(countrycode)
ccodedta <- codelist_panel
ccodedta <- ccodedta |>
  filter(year >= 1899 & year <= 2014)
ccodedta <- ccodedta[, c( "year", 
                          "continent", 
                          "cown", 
                          "region", 
                          "country.name.en", 
                          "iso3c", 
                          "gwn")]
ccodedta <- na.omit(ccodedta[!is.na(ccodedta$gwn), ])

#Adding ccodes to the navco dataset
navco_13 <- navco_13 |>
  mutate(cown = countrycode(country, "country.name", 
                            "cown", 
                            warn = TRUE))

countries_with_na <- navco_13 %>%
  filter(is.na(cown)) %>%
  select(country)
rm(countries_with_na)

navco_13 <- navco_13 %>%
  filter(!is.na(cown))

#Chaning UOA to protest year 
navco_13 <- navco_13 |>
  rowwise() |>
  mutate(year = list(seq(BYEAR, 
                         EYEAR))) |>
  unnest(year) |>
  ungroup()

#Merging NAVCO into the country year dataset created by COW
protest_df <- left_join(ccodedta, 
                        navco_13 , 
                        by = c("year" = "BYEAR", 
                              "cown" = "cown"))

rm(ccodedta)
rm(ccode_dta)
rm(navco_13)

#Now I need to collapse by country year. 
# Aggregate to country-year with binary indicators and retain LOCATION
navco_13_country_year <- protest_df |>
  group_by(cown, year) |>
  summarise(
    VIOL = as.integer(any(VIOL == 1)),          # Set to 1 if any VIOL event occurred
    NONVIOL = as.integer(any(NONVIOL == 1)),    # Set to 1 if any NONVIOL event occurred
    ONGOING = as.integer(any(ONGOING == 1)),    # Set to 1 if any event was ongoing
  ) |>
  ungroup()

# View the result
head(navco_13_country_year)

protest_df <- navco_13_country_year

#filling in missing data 
protest_df <- protest_df |>
  mutate(NONVIOL = ifelse(is.na(NONVIOL), 0, NONVIOL)) |>
  mutate(VIOL = ifelse(is.na(VIOL), 0, VIOL)) |>
  mutate(ONGOING = ifelse(is.na(ONGOING), 0, ONGOING))

#time since for non-violence dissent 
protest_df <- ps_btscs(protest_df, 
                       NONVIOL, 
                       year, 
                       cown)


protest_df <- protest_df |>
  rename(protest_peace_years = spell)
protest_df <- protest_df |>
  mutate(pp_years_sq = protest_peace_years^2) |>
  mutate(pp_years_cb = protest_peace_years^3)

#time since for violent dissent 
protest_df <- ps_btscs(protest_df, 
                       VIOL, 
                       year, 
                       cown)

protest_df <- protest_df |>
  rename(vd_peace_years = spell)
protest_df <- protest_df |>
  mutate(vd_years_sq = vd_peace_years^2) |>
  mutate(vd_years_cb = vd_peace_years^3)



#now before I go any further, I want to filter my DF
protest_df <- protest_df |>
  filter(year >= 1981 & year <= 2014)








# Merging Dataframes  -----------------------------------------------------

#starting with COW
full_df <- left_join(cow_df,
                     protest_df,
                     by = c("year" = "year", 
                            "ccode" = "cown"))
rm(cow_df, protest_df)

#merge 2
full_df <- left_join(full_df,
                     pgm_df, 
                     by = c("year" = "year", 
                            "gwcode" = "gwno"))

rm(pgm_df)

#merge 3
full_df <- left_join(full_df,
                     pop_data, 
                     by = c("year" = "year", 
                            "ccode" = "cow"))

rm(pop_data)

#merge 4
full_df <- left_join(full_df,
                     PTS_2024, 
                     by = c("year" = "year", 
                            "ccode" = "cowcode"))
full_df <- unique(full_df) #warning about matches / removing potential problems
rm(PTS_2024)

#Merge 5
full_df <- left_join(full_df,
                     vdem_df, 
                     by = c("year" = "year", 
                            "ccode" = "cow"))
rm(vdem_df)

#merge 6 
full_df <- left_join(full_df,
                     coups, 
                     by = c("year" = "year", 
                            "ccode" = "ccode"))
rm(coups)

full_df <- unique(full_df) 

#merge #7 
full_df <- left_join(full_df,
                     collapsed_epr, 
                     by = c("year" = "year", 
                            "ccode" = "gwid"))

rm(collapsed_epr)









# Cleaning full_df --------------------------------------------------------

#temporal domain 
full_df <- full_df |>
  filter(year > 1980 & year < 2015)

#checking for and removing repeat observations 
full_df <- full_df %>% distinct(ccode, year, .keep_all = TRUE)











# Creating / Altering Variables  ------------------------------------------

#fix the coup variable 
full_df <- full_df |>
  mutate(coup = ifelse(is.na(coup_attempt), 0, coup_attempt))

#logs
full_df <- full_df |>
  mutate(vdem_civlib_sq = vdem_civlib^2) |>
  mutate(log_pop = log(wbpopest)) |>
  mutate(pop_sq = wbpopest^2) 

#lags 
full_df <- full_df |>
  group_by(ccode) |>
  mutate(lag_presence = lag(presence, 1)) |> 
  mutate(lag_count = lag(presence_count, 1)) |>
  mutate(lag_govt_pgm = lag(presence_govformed, 1)) |>
  mutate(lag_not_govt_pgm = lag(presence_notgovformed, 1)) |>
  mutate(lag_coup = lag(coup, 1))

#cold war dummy
full_df$post_cw <- ifelse(full_df$year > 1991, 1, 0)

#General PGM PTS interaction term 
full_df$pgm_pts_int <- full_df$presence * full_df$PTS_A

#Informal PGM PTS interaction term 
full_df$inf_pgm_pts_int <- full_df$presence_informal * full_df$PTS_A

#Semi-Official PGM PTS interaction term 
full_df$so_pgm_pts_int <- full_df$presence_semiofficial * full_df$PTS_A

#Lag of the Interaction terms 
full_df <- full_df |>
  group_by(ccode) |>
  mutate(lag_presence_int = lag(pgm_pts_int, 1)) |>
  mutate(lag_inf_int = lag(inf_pgm_pts_int, 1)) |>
  mutate(lag_so_int = lag(so_pgm_pts_int, 1)) 





# Saving the dataframe  ---------------------------------------------------

save(full_df, file = "/Users/benjaminleo/Library/Mobile Documents/com~apple~CloudDocs/PGMs & Protest Empirics/PGM&Protest Paper/protest dataframe")
write.dta(full_df, "pgm_protest_full.dta")








# Descriptive Plots -------------------------------------------------------

#descriptive plot function 
descrip_plot <- function(data, # expects data.frame or tbl
                         variables = NULL, # names of variables to be plotted, if NULL plot all
                         print_plots = TRUE,# should plots be immediately plotted?
                         return_plots = FALSE,
                         p_black = "black",
                         p_fill = "skyblue") {  # should plots be saved in a list to be available later?
  if (is.null(variables)) {
    variables <- names(data)
  }
  plot_list <- vector(mode = "list", length = length(variables))
  names(plot_list) <- variables
  for (variable in variables) {
    if (length(na.omit(unique(data[[variable]]))) == 1) {
      next
    }
    p <- ggplot(data, aes(x = .data[[variable]])) 
    if (is.numeric(data[[variable]])) {
      if (length(na.omit(unique(data[[variable]]))) < 30) {
        bins <- length(na.omit(unique(data[[variable]])))
      } else {
        bins <- 30
      }
      p <-  p + geom_histogram(bins = bins, 
                               color = p_black, 
                               fill = p_fill, 
                               alpha = 0.7) 
    } else {
      p <-  p + geom_bar(color = p_black, 
                         fill = p_fill, 
                         alpha = 0.7) 
    }
    p <- p + labs(title = paste("Distribution of", variable),
                  x = variable,
                  y = "Frequency") +
      theme_minimal() 
    if (print_plots) {
      print(p)
    }
    plot_list[[variable]] <- p
  }
  if (return_plots) {
    return(plot_list)
  }
}

descrip_plot(full_df)
duplicate_rows <- duplicated(full_df[c("ccode", "year")])
full_df <- full_df[!duplicate_rows, ]
descrip_plot(full_df)

#dataframe for this 
collapsed_data <- full_df |>
  group_by(ccode) |>
  summarise(
    VIOL_count = sum(VIOL, 
                     na.rm = TRUE),
    NONVIOL_count = sum(NONVIOL, 
                        na.rm = TRUE)
  )




# Get world map data
world_map <- map_data("world") 

#Adding gwno to world map data 
world_map$gwno <- countrycode(world_map$region, 
                              origin = "country.name",
                              destination = "gwn")

#merging
merged_df <- left_join(world_map, 
                       collapsed_data, 
                       by = c("gwno" = "ccode"))


#plot
ggplot() +
  geom_polygon(data = merged_df, 
               aes(x = long,
                   y = lat, 
                   group = group, 
                   fill = NONVIOL_count), 
               color = "black") +
  scale_fill_gradient(name = "Number of NVMPCs", 
                      low = "white", 
                      high = "black", 
                      na.value = "white") +
  coord_fixed() +
  theme_void() + 
  labs(
    title = "Global Distribution of NVMPCs (1981-2014)"
  )

#plot
ggplot() +
  geom_polygon(data = merged_df, 
               aes(x = long,
                   y = lat, 
                   group = group, 
                   fill = VIOL_count), 
               color = "black") +
  scale_fill_gradient(name = "Number of VMCs", 
                      low = "white", 
                      high = "black", 
                      na.value = "white") +
  coord_fixed() +
  theme_void() + 
  labs(
    title = "Global Distribution of VMCs (1981-2014)"
  )





